# Setup ----
options(scipen = 999)  # suppress scientific notation in printed output

# Attach every package used below
pkgs <- c("data.table", "magrittr", "caret", "glmnet", "ranger", "plotly", "iml")
inst <- lapply(pkgs, library, character.only = TRUE)

# Project helper functions (Summarize, CalcAUC, ModelAUC, FeatureComparison, ...)
source("~/Documents/R/Utils/functions/summarize.R")
source("~/Documents/R/Utils/functions/model_auc.R")
source("~/Documents/R/Utils/functions/feature_comparison.R")
source("~/Documents/R/Utils/functions/helper.R")

# Raw IBM Telco customer-churn data
data <- fread("~/Documents/R/churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
seed <- 2018
# Exploratory data analysis ----
# (reconstructed from fused notebook-export lines)
head(data)

# The "No internet service" level of every internet add-on column coincides
# exactly with InternetService == "No"
data[, .N, .(OnlineSecurity == "No internet service",
             TechSupport == "No internet service",
             InternetService == "No",
             DeviceProtection == "No internet service",
             StreamingMovies == "No internet service",
             StreamingTV == "No internet service")]

# Churn rate by senior-citizen status and internet availability
data[, .(Obs = .N, Churn = sum(Churn == "Yes") / .N),
     .(SeniorCitizen, haveInternet = InternetService != "No")]

# NOTE(review): `nMth` is not a column of the raw CSV — presumably derived
# interactively in an earlier session; confirm before rerunning
plot_ly(data, x = ~tenure, y = ~nMth)

# Tenure distribution by contract type
FeatureComparison(list(month2month = data[Contract == "Month-to-month"]$tenure,
                       oneYear     = data[Contract == "One year"]$tenure,
                       twoYear     = data[Contract == "Two year"]$tenure),
                  "tenure")

# The goal is to find variables with predictive value for churn.
# TotalCharges has 11 missing rows: we can remove them, or impute with the
# mean tenure / the most frequent tenure (1).
sum.data <- Summarize(data)
sum.data

# Feature-engineering base table: drop the NA TotalCharges rows and the
# customerID column (column 1)
data.fe <- data[!is.na(TotalCharges)][, -1]
# Train-test split ----
set.seed(seed)
# Hold out floor(n / 20) rows (~5%) as the test set
index.test <- sample(seq_len(nrow(data.fe)), nrow(data.fe) %/% 20,
                     replace = FALSE)
data.train <- data.fe[-index.test]
data.test  <- data.fe[index.test]

# Check that the churn distribution is consistent between train and test
FeatureComparison(list(Train = data.train$Churn, Test = data.test$Churn),
                  vectorName = "Churn",
                  probability = TRUE)
addFeat <- function(d, feature, value) {
  # Add a 0/1 indicator column named "<feature>_<value>" to data.table `d`
  # (by reference), flagging rows where `feature` equals `value`.
  # Rows where `feature` is NA stay NA, exactly as with the two-step
  # conditional assignment this replaces.
  colName <- paste0(feature, "_", make.names(value))
  d[, (colName) := as.integer(get(feature) == value)]
  d
}
onehot <- function(d, feature, values = NULL) {
  # One-hot encode `feature` in data.table `d` (modified by reference):
  # one indicator column per entry of `values`, then drop the original column.
  #
  # d       data.table, mutated in place and returned
  # feature column name (string) to encode
  # values  levels to encode; defaults to the observed unique values, keeping
  #         only the first one when the feature is binary (k-1 encoding)
  if (is.null(values)) {
    values <- d[, unique(get(feature))]
    if (length(values) == 2L) values <- values[1]
  }
  # addFeat() mutates d by reference, so a plain loop is equivalent to (and
  # clearer than) the original lapply(1:length(values), ...)[[1]] %>% setDT
  # construction, which only worked because every list element was d itself
  for (v in values) addFeat(d, feature, v)
  d[, (feature) := NULL]
  d
}
# One-hot encoding ----
data.oh <- copy(data.fe)
# BUG FIX: the gender column holds "Male"/"Female" (see the genderMale dummy
# that model.matrix produces below); the original lowercase "male" matched
# nothing, so gender_male was all zeros
onehot(data.oh, "gender", "Male")
onehot(data.oh, "Partner", "Yes")
onehot(data.oh, "Dependents", "Yes")
onehot(data.oh, "PhoneService", "Yes")
onehot(data.oh, "MultipleLines")
onehot(data.oh, "InternetService")
# For the internet add-ons, encode only Yes/No; "No internet service" is
# already captured by the InternetService dummies
onehot(data.oh, "OnlineSecurity", c("Yes", "No"))
onehot(data.oh, "OnlineBackup", c("Yes", "No"))
onehot(data.oh, "DeviceProtection", c("Yes", "No"))
onehot(data.oh, "TechSupport", c("Yes", "No"))
onehot(data.oh, "StreamingTV", c("Yes", "No"))
onehot(data.oh, "StreamingMovies", c("Yes", "No"))
onehot(data.oh, "Contract")
onehot(data.oh, "PaperlessBilling", "Yes")
onehot(data.oh, "PaymentMethod")

# Split the encoded table with the same held-out indices
dataOH.train <- data.oh[-index.test]
dataOH.test  <- data.oh[index.test]
# Regularized logistic regression (glmnet) ----
# Design matrix (dummy-encoded, intercept column dropped) and response
xlr <- model.matrix(Churn ~ ., data = data.fe)[, -1]
ylr <- data.fe$Churn

# Fit a cross-validated glmnet model on the training rows and score the
# held-out rows at lambda.min; alpha = 1 is lasso, alpha = 0 is ridge.
# Returns a one-row data.table with the settings and the test AUC.
buildLR <- function(alpha, nfolds, seed = 2018) {
  set.seed(seed)
  fit <- cv.glmnet(xlr[-index.test, ], ylr[-index.test],
                   alpha = alpha, nfolds = nfolds, family = "binomial")
  scores <- predict(fit, s = fit$lambda.min,
                    newx = xlr[index.test, ], type = "response")[, 1]
  data.table(alpha = alpha, nfolds = nfolds, lambda = fit$lambda.min,
             auc = CalcAUC(scores = scores,
                           target = data.fe[index.test, Churn == "No"]))
}
# check parameter
plr = purrr::map2_dfr(.x = rep(0:1, each = 18), .y = rep(3:20, 2), buildLR) %>%
setDT %>%
.[alpha==0, regularization := "L2"] %>%
.[alpha==1, regularization := "L1"]
plot_ly(plr, x = ~nfolds, y = ~lambda, color = ~regularization,
colors = c("red", "blue")) %>% add_markers()# build models
# Final lasso / ridge fits ----
set.seed(seed)
mlr.l1 <- cv.glmnet(xlr[-index.test, ], ylr[-index.test],
                    alpha = 1, nfolds = 10, family = "binomial")
set.seed(seed)
mlr.l2 <- cv.glmnet(xlr[-index.test, ], ylr[-index.test],
                    alpha = 0, nfolds = 10, family = "binomial")

# Test-set AUC of the already-fitted model at its i-th lambda value
# (alpha selects which fitted model to score)
buildLR2 <- function(alpha, i) {
  fit <- if (alpha == 1) mlr.l1 else mlr.l2
  scores <- predict(fit, s = fit$lambda[i],
                    newx = xlr[index.test, ], type = "response")[, 1]
  data.table(alpha = alpha, lambda = fit$lambda[i],
             auc = CalcAUC(scores = scores,
                           target = data.fe[index.test, Churn == "No"]))
}
# Test AUC along each model's full lambda path
plr2 <- purrr::map2_dfr(.x = rep(1, length(mlr.l1$lambda)),
                        .y = seq_along(mlr.l1$lambda), buildLR2) %>%
  rbind(purrr::map2_dfr(.x = rep(0, length(mlr.l2$lambda)),
                        .y = seq_along(mlr.l2$lambda), buildLR2)) %>%
  setDT %>%
  .[alpha == 0, regularization := "L2"] %>%
  .[alpha == 1, regularization := "L1"]
# plot
plot_ly(plr2, y = ~auc, x = ~lambda, color = ~regularization,
        colors = c("red", "blue")) %>% add_markers()

# Test-set scores at lambda.min for both penalties, compared via AUC
pred.l1 <- predict(mlr.l1, s = mlr.l1$lambda.min,
                   newx = xlr[index.test, ], type = "response")[, 1]
pred.l2 <- predict(mlr.l2, s = mlr.l2$lambda.min,
                   newx = xlr[index.test, ], type = "response")[, 1]
ModelAUC(scores = data.table(Lasso = pred.l1, Ridge = pred.l2),
         target = data.test[, Churn == "No"])

coef(mlr.l1)
# Pasted output from a previous run (31 x 1 sparse dgCMatrix): the lasso
# zeroed out genderMale, PartnerYes, OnlineBackupYes, DeviceProtectionYes,
# PaymentMethod Credit card / Mailed check, MonthlyCharges and TotalCharges.
# Largest positive coefficients: InternetServiceFiber optic (0.84),
# PaymentMethodElectronic check (0.37), PaperlessBillingYes (0.31).
# Largest negative: ContractTwo year (-0.92), InternetServiceNo (-0.63),
# ContractOne year (-0.50); tenure and TechSupportYes also negative.

# caret note: trainControl's selectionFunction argument allows changing the
# tuning rule among "best", "oneSE" and "tolerance".
# Elastic-net tuning with caret (10-fold CV, 5 repeats) ----
clr <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
                    classProbs = TRUE, summaryFunction = twoClassSummary)
# Search grid: mixing parameter alpha in {0, 0.1, ..., 1},
# penalty lambda on a log10 scale from 1e3 down to 1e-6
glr <- expand.grid(alpha = 0:10 / 10, lambda = 10^seq(3, -6))
set.seed(2018)
mlr <- train(as.factor(Churn) ~ ., data = data.train,
             method = "glmnet", trControl = clr, tuneGrid = glr,
             metric = "ROC")
plot(mlr)
mlr$bestTune
# Random forest on the raw features ----
# ranger needs factors rather than character columns: convert every
# character column (except customerID) to factor and cbind the untouched
# numeric columns back on
xrf <- copy(data.fe)
xrf <- xrf[, lapply(.SD, as.factor),
           .SDcols = sum.data[Class == "character" & variable != "customerID",
                              as.character(variable)]] %>%
  cbind(xrf[, mget(sum.data[Class != "character", as.character(variable)])])

# Search grid
grf <- expand.grid(mtry = 1:5 * 4,
                   splitrule = c("gini", "extratrees"),
                   min.node.size = 1:5 * 20)
# ntree = c(seq(10, 90, 10), seq(100, 900, 100), seq(1000, 10000, 1000)))

# 10-fold CV
set.seed(seed)
crf10 <- trainControl(method = "repeatedcv", number = 10, repeats = 1,
                      classProbs = TRUE, summaryFunction = twoClassSummary)
set.seed(seed)
mrf <- train(as.factor(Churn) ~ ., data = xrf[-index.test],
             method = 'ranger', trControl = crf10, tuneGrid = grf,
             metric = "ROC")
plot(mrf)
mrf$bestTune
# Random forest on the one-hot encoded features ----
set.seed(seed)
crf <- trainControl(method = "repeatedcv", number = 10, repeats = 1,
                    classProbs = TRUE, summaryFunction = twoClassSummary)
# Same search grid as the raw-feature forest
grf <- expand.grid(mtry = 1:5 * 4,
                   splitrule = c("gini", "extratrees"),
                   min.node.size = 1:5 * 20)
set.seed(seed)
moh <- train(as.factor(Churn) ~ ., data = dataOH.train,
             method = 'ranger', trControl = crf, tuneGrid = grf,
             metric = "ROC")
plot(moh)
moh$bestTune

# Elastic-net test-set scores (probability of the second class level)
pred.lr <- predict(mlr, newdata = data.test, type = "prob")[, 2]
pred.rf <- predict(mrf, newdata = data.test, type = "prob")[, 2]
pred.oh <- predict(moh, newdata = dataOH.test, type = "prob")[, 2]
# Compare all five models on test-set AUC
ModelAUC(scores = data.table(Lasso = pred.l1, Ridge = pred.l2,
                             Mixed = pred.lr, RF = pred.rf,
                             RFOneHot = pred.oh),
         target = data.test[, Churn == "No"])

# Model interpretation with iml ----
# Permutation feature importance (classification error loss)
set.seed(seed)
modelRF <- Predictor$new(mrf, data = data.frame(data.train), y = "Churn",
                         type = "prob")
impRF <- FeatureImp$new(modelRF, loss = "ce")
plot(impRF)  # impRF$results for the raw table

set.seed(seed)
modelOH <- Predictor$new(moh, data = data.frame(dataOH.train), y = "Churn",
                         type = "prob")
impOH <- FeatureImp$new(modelOH, loss = "ce")
plot(impOH)  # impOH$results for the raw table

# Partial dependence plots: raw-feature model vs one-hot model
pdpRF <- FeatureEffect$new(modelRF, method = "pdp",
                           feature = "InternetService")
pdpRF$plot()
pdpOH <- FeatureEffect$new(modelOH, method = "pdp",
                           feature = "InternetService_Fiber.optic")
pdpOH$plot()
pdpOH <- FeatureEffect$new(modelOH, method = "pdp",
                           feature = "InternetService_No")
pdpOH$plot()
pdpRF$set.feature("tenure"); pdpRF$plot()
pdpOH$set.feature("tenure"); pdpOH$plot()
pdpRF$set.feature("Contract"); pdpRF$plot()
pdpOH$set.feature("Contract_Month.to.month"); pdpOH$plot()
pdpRF$set.feature("PaperlessBilling"); pdpRF$plot()
pdpRF$set.feature("TotalCharges"); pdpRF$plot()
pdpOH$set.feature("TotalCharges"); pdpOH$plot()
pdpRF$set.feature("MonthlyCharges"); pdpRF$plot()
pdpOH$set.feature("MonthlyCharges"); pdpOH$plot()

# Shapley decomposition of a single test observation
i <- 100
shapleyRF <- Shapley$new(modelRF,
                         x.interest = data.frame(data.test[, -"Churn"][i, ]))
shapleyRF$plot()
i <- 100
shapleyOH <- Shapley$new(modelOH,
                         x.interest = data.frame(dataOH.test[, -"Churn"][i, ]))
shapleyOH$plot()

# Persist the design matrices, tuning tables and fitted models
save(xlr, ylr, plr, plr2, mlr.l1, mlr.l2, mlr, mrf, moh, file = "model.RData")
load("model.RData")